*** Prepare DNA data from Cavalli-Sforza

set more off
clear

* Root folder of the replication package. Keep the trailing backslash:
* every path in this file is built by appending directly to this macro.
local path0 "C:\Dropbox\GeneticsProject (1)\REStat\round_accepted_replication_files\"

*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
* Convert the country-code lookup table from delimited text to Stata format
* (the first row of the text file supplies the variable names).
local cs_dir "`path0'Data\CavalliSforza\"
insheet using "`cs_dir'Country_codes.txt", clear names
save "`cs_dir'Country_codes.dta", replace

*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

* Load the raw Cavalli-Sforza allele-frequency table; the first row of the
* text file holds the variable names.
insheet using "`path0'Data\CavalliSforza\c-s condense - RAW.txt", clear names


*** drop A+/A- allele info (rows whose allele code is "A1" or "A2")
drop if inlist(allele, "A1", "A2")

*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
*** 	create countries for countries that split into several countries: USSR
*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
* Rows labelled "Ussr" are reassigned to a successor state according to
* their region value. Every rule only matches rows that are still labelled
* "Ussr"; whatever is left unmatched at the end of the section becomes
* "Russia".
* NOTE(review): a few region assignments look worth confirming against the
* source data ("Samarkand Distri" -> Tajikistan, "Azov Black Sea" -> Ukraine,
* "Ordzhonikidze As" -> Georgia) -- presumably deliberate; verify.
replace state = "Armenia" if region=="Armeniya" & state=="Ussr"

replace state = "Azerbaijan" if region=="Azerbaidzhan Ssr" & state=="Ussr"

replace state = "Belarus" if state=="Ussr" & inlist(region, "Belorussiya", "Minsk")

replace state = "Georgia" if state=="Ussr" & ///
	inlist(region, "Georgia", "Georgia Imereti", "Ordzhonikidze As")

replace state = "Estonia" if region=="Estonia" & state=="Ussr"

replace state = "Kazakhstan" if state=="Ussr" & inlist(region, "Alma Ata Oblast", "Kazakhstan")

replace state = "Kyrgyzstan" if state=="Ussr" & ///
	inlist(region, "Issyk Kul''Oblast", "Kirghizkaja Assr")

replace state = "Latvia" if region=="Latvia" & state=="Ussr"
* (disabled) replace population="Latvian" if population=="Russian" & state=="Latvia"

replace state = "Lithuania" if region=="Lithuania" & state=="Ussr"
* (disabled) replace population="Lithuanian" if population=="Russian" & state=="Lithuania"


replace state = "Moldova" if region=="Moldaviya" & state=="Ussr"

replace state = "Tajikistan" if state=="Ussr" & ///
	inlist(region, "Kurgan Tyube Dis", "Pamir", "Pyandzh", "Rushan", ///
	               "Samarkand Distri", "Tadzhikistan", "Vakhanskiy Dhreb")

replace state = "Turkmenistan" if region=="Turkmenistan" & state=="Ussr"

replace state = "Ukraine" if state=="Ussr" & inlist(region, "Azov Black Sea", "Ukraina")
* Within Ukraine, recode population code 72403 to 72401 and relabel
* "Russian" populations as "Ukrainian". Both rules apply to every row whose
* state is "Ukraine" at this point, not only to former "Ussr" rows.
replace pop_code = 72401 if state=="Ukraine" & pop_code==72403
replace population = "Ukrainian" if state=="Ukraine" & population=="Russian"


replace state = "Uzbekistan" if state=="Ussr" & inlist(region, "Surkhan Dar''Insk", "Uzbekistan")

* Catch-all: every remaining "Ussr" row is assigned to Russia.
replace state = "Russia" if state=="Ussr"

*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
*** 	create countries for countries that split into several countries: Czechoslovakia
*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

* A "Czecoslovakia" row (spelling as it appears in the data) goes to the
* Czech Republic when its region is one of the listed values or empty, or
* when its subregion is Ostrava or Hlucinsko.
replace state = "CzechRepublic" if state=="Czecoslovakia" & ///
	( inlist(region, "Bohemia", "Central", "Moravia", "Praha", "") ///
	| inlist(subregion, "Ostrava", "Hlucinsko") )

* Everything still labelled "Czecoslovakia" is assigned to Slovakia.
replace state = "Slovakia" if state=="Czecoslovakia"


*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
*** 	create countries for countries that split into several countries: Yugoslavia
*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
* There is no catch-all rule in this section: a "Yugoslavia" row that
* matches none of the rules below keeps the label "Yugoslavia".

* Croatia is identified either by region or by subregion.
replace state = "Croatia" if state=="Yugoslavia" & ///
	( inlist(region, "Croatia", "Dalmatian Coast") ///
	| inlist(subregion, "Osijek", "Rijeka") )

replace state = "Slovenia" if region=="Slovenija" & state=="Yugoslavia"
* Every row whose state is "Slovenia" gets the population label "Slovenians".
replace population = "Slovenians" if state=="Slovenia"

replace state = "BosniaHerzegovina" if region=="Bosniahercegovin" & state=="Yugoslavia"

replace state = "Serbia-Montenegro" if state=="Yugoslavia" & ///
	inlist(region, "Beograd", "Serbia", "Montenegro")

replace state = "Macedonia" if region=="Makedonija" & state=="Yugoslavia"


*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
*** 	modern/English names of states
*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

* Lookup table: old<i> is the name found in the data, new<i> its replacement.
local old1 "Dahomey"
local new1 "Benin"
local old2 "Cameroun"
local new2 "Cameroon"
local old3 "Burma"
local new3 "Myanmar"
local old4 "New Guinea"
local new4 "Papua New Guinea"
local old5 "Upper Volta"
local new5 "Burkina Faso"
local old6 "Zaire"
local new6 "Congo, Dem Rep"
local old7 "Mocambique"
local new7 "Mozambique"
local old8 "Ruanda"
local new8 "Rwanda"

* Apply the renames in table order (exact, case-sensitive match on state).
forvalues i = 1/8 {
	replace state = "`new`i''" if state=="`old`i''"
}


*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
*** 	compute average frequencies of blood types across studies
*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
* For every (state, allele, population) cell the individual studies are
* pooled into:
*   sample_size_wgt  total sample size of the cell
*   allele_wgt       sample-size-weighted mean of freq
*   allele_wgt_sd    square root of the sample-size-weighted mean squared
*                    deviation of freq from allele_wgt
* NOTE(review): a row with missing freq still adds its sample_size to the
* denominator -- presumably no such rows exist; verify against the raw data.

* Remember the raw row order. egen with by() may re-sort the data and
* Stata's sort breaks ties arbitrarily, so without this the (first) values
* kept by collapse below could differ from one run to the next.
gen long temp_order = _n

egen sample_size_wgt = total(sample_size), by(state allele population)
egen temp_allele = total(sample_size*freq), by(state allele population)
gen allele_wgt = temp_allele/sample_size_wgt

* Each study's share of the cell's total sample size is its weight.
gen temp_sample_size = sample_size/sample_size_wgt
gen temp_allele_sd = temp_sample_size*(freq-allele_wgt)^2
egen temp_allele_sd2 = total(temp_allele_sd), by(state allele population)
gen allele_wgt_sd = (temp_allele_sd2)^0.5

* Restore raw-file order inside every cell so that (first) always picks the
* same study, then discard the helper variables.
sort state allele population temp_order
drop temp_*


* One row per cell. The pooled statistics are constant within a cell; the
* descriptive fields, sample_size and freq come from the cell's first row.
collapse (first)  continent region subregion latitude longitude ///
			pop_description pop_code ///
			sample_size freq ///
			sample_size_wgt allele_wgt allele_wgt_sd, ///
			by(state allele population)


compress
save "`path0'Workfiles\DNAdata.dta", replace



*=====================================================================
* 	Additional data not listed in C-S but available in other sources
*=====================================================================
* Same pooling as for the main table, but cells are defined by
* (allele, pop_code):
*   sample_size_wgt  total sample size of the cell
*   allele_wgt       sample-size-weighted mean of freq
*   allele_wgt_sd    square root of the sample-size-weighted mean squared
*                    deviation of freq from allele_wgt
* NOTE(review): the statistics are pooled by (allele, pop_code) while the
* collapse groups by (state, allele, pop_code) -- presumably each pop_code
* belongs to a single state; verify.

insheet using "`path0'Data\CavalliSforza\Data for additional groups.txt", names clear

* Remember the raw row order. egen with by() may re-sort the data and
* Stata's sort breaks ties arbitrarily, so without this the (first) values
* kept by collapse below could differ from one run to the next.
gen long temp_order = _n

egen sample_size_wgt = total(sample_size), by(allele pop_code)
egen temp_allele = total(sample_size*freq), by(allele pop_code)
gen allele_wgt = temp_allele/sample_size_wgt

* Each study's share of the cell's total sample size is its weight.
gen temp_sample_size = sample_size/sample_size_wgt
gen temp_allele_sd = temp_sample_size*(freq-allele_wgt)^2
egen temp_allele_sd2 = total(temp_allele_sd), by(allele pop_code)
gen allele_wgt_sd = (temp_allele_sd2)^0.5

* Restore raw-file order inside every collapse group so that (first) always
* picks the same row, then discard the helper variables.
sort state allele pop_code temp_order
drop temp_*

collapse (first)  continent region subregion latitude lon* ///
			pop_description population ///
			sample_size freq ///
			sample_size_wgt allele_wgt allele_wgt_sd, ///
			by(state allele pop_code)

* Rows without a population code are discarded, and two descriptive
* columns are dropped before the file is saved.
drop if pop_code==.
drop region pop_description

* Relabel "Gypsy/Roma": "Gypsy" for Hungary, "Roma" for the listed states.
* Rows for any other state keep the original "Gypsy/Roma" label.
replace population="Gypsy" if state=="Hungary" & population=="Gypsy/Roma"
replace population="Roma" if population=="Gypsy/Roma" & ///
	inlist(state, "Spain", "Romania", "Slovakia", "CzechRepublic", "Greece", "Macedonia")

save "`path0'Workfiles\Data_for_additional_groups.dta", replace
			
*=====================================================================
* Stack the additional groups under the main table, overwrite freq with
* the pooled weighted mean (allele_wgt), and save the result back over
* DNAdata.dta.
local dna_file "`path0'Workfiles\DNAdata.dta"

use "`dna_file'", clear
append using "`path0'Workfiles\Data_for_additional_groups.dta"
compress
replace freq = allele_wgt
save "`dna_file'", replace


		
